#######################
# 2. Comparing donors and non-donors, predicted and true CFscores
#######################


# load packages
library(randomForest)
library(dplyr)
library(missForest)
library(haven)

# load data
load('1_output.RData')

###

# 1. Compare donors and non-donors in CCES (Table 1)

# create variable indicating donors
cces12$donor <- ifelse(complete.cases(cces12$true_CFscore), 1, 0)

# create demographic variables
cces12$age <- 2012 - cces12$birthyr
cces12$over100k <- ifelse(cces12$faminc < 10, 0,
                          ifelse(cces12$faminc < 97, 1, NA))
cces12$bach <- ifelse(cces12$educ >= 5, 1, 0)

# create political variables
cces12$moderate <- ifelse(cces12$ideo5 == 3, 1, ifelse(cces12$ideo5 > 5, NA, 0))
cces12$strong_partisan <- ifelse(cces12$pid7 == 1 | cces12$pid7 == 7, 1, 0)
cces12$follows_news <- ifelse(cces12$newsint == 1, 1, 0)
cces12$quiz_score <- ifelse(cces12$CC309a == 1 & cces12$CC309b == 2, 1, 0)

# compare groups
t.test(age ~ donor, data = cces12)
t.test(over100k ~ donor, data = cces12)
t.test(bach ~ donor, data = cces12)
t.test(moderate ~ donor, data = cces12)
t.test(strong_partisan ~ donor, data = cces12)
t.test(follows_news ~ donor, data = cces12)
t.test(quiz_score ~ donor, data = cces12)


###

# 2. Compare variable importance

# a. OLS-based analysis

# Set ideological placement variable
cces12$ideo_placement <- ifelse(cces12$ideo5 < 6, cces12$ideo5, NA)

# Create formula for OLS analysis
temp <- paste(policies, collapse=" + ") 
formula_ols <- as.formula(paste('ideo_placement', temp, sep = ' ~ '))

# Regress self-placement on issue positions for donors and nondonors
fit.donors <- lm(formula_ols, data = cces12[cces12$donor == 1,])
fit.nondonors <- lm(formula_ols, data = cces12[cces12$donor == 0,])

# Correlate regression coefficients
cor(fit.donors$coefficients, fit.nondonors$coefficients)


# b. Random forest variable importance analysis (Table 2)

# Create formula to explain CFscores with issues
temp <- paste(policy_chars, collapse=" + ") 
formula_cf_issues <- as.formula(paste('true_CFscore', temp, sep = ' ~ '))

# Create formula to explain self-placements with issues
formula_placement_issues <- as.formula(paste('ideo_placement', temp, sep = ' ~ '))

# Run random forest models
set.seed(1776)
rf_cf_importance <- randomForest(formula_cf_issues, data = cces12[cces12$donor == 1,], ntree = 1000, nodesize = 10, mtry = 2, na.action = na.omit)
set.seed(1776)
rf_donorplace_importance <- randomForest(formula_placement_issues, data = cces12[cces12$donor == 1,], ntree = 1000, nodesize = 10, mtry = 2, na.action = na.omit)
set.seed(1776) # takes 10 minutes, reduce ntree or increase nodesize for approx. results
rf_nondonorplace_importance <- randomForest(formula_placement_issues, data = cces12[cces12$donor == 0,], ntree = 1000, nodesize = 10, mtry = 2, na.action = na.omit)

# Compare variable importance vectors
cor.test(~ rf_cf_importance$importance + rf_donorplace_importance$importance) # 0.95 (.90 chars)
cor.test(~ rf_cf_importance$importance + rf_nondonorplace_importance$importance) # 0.83 (.79 chars)
cor.test(~ rf_donorplace_importance$importance + rf_nondonorplace_importance$importance) # 0.93 (.91 chars)


###

# 3. Do issue positions predict CFscores?

# a. Prepare data

# Create donors dataset
cces_donors <- cces12[complete.cases(cces12$true_CFscore),]

# Separate training and testing data
set.seed(1986) # LGM
sample <- sample(1:NROW(cces_donors), round(NROW(cces_donors)*.50), replace=F) # draw 50% as training
training <- cces_donors[sample,] # create training and testing sets
testing <- cces_donors[-sample,]


# b. Train model on training data, predict values for testing data

# Create model formula
temp <- paste(policy_chars, collapse=" + ")
formula_full <- as.formula(paste("true_CFscore", temp, sep = " ~ "))

# Run model on training data
set.seed(1776)
train.fit <- randomForest(formula_full,
                    data=training,ntree=1000,nodesize=10,mtry = 2,
                    na.action = na.omit,keep.inbag=T)

# Predict values for testing data
testing$pred_CFscore <- predict(train.fit, newdata = testing)


# c. Compare true and predicted values

# Figure 2
colors <- c('dodgerblue','gold','red')
pch <- c(1,2,19)
plot(testing$pred_CFscore, testing$true_CFscore, 
     pch = pch[factor(testing$party_id)],
     col = colors[factor(testing$party_id)],
     xlab = 'Predicted CFscore',
     ylab = 'True CFscore',
     main = 'Figure 2. Random forest predictions of donor CFscores (r = 0.94)',
     cex.lab=1.5, cex.axis=1.5, cex.main=1.5, cex.sub=1.5)
legend('topleft', legend = c('Democrats','Independents','Republicans'), pch = pch, col = colors, cex = 1.2)
abline(lm(testing$true_CFscore ~ testing$pred_CFscore), lty = 2)
rug(testing$pred_CFscore, side = 1, ticksize = .02)


# Calculate correlations
cor.test(~ testing$pred_CFscore + testing$true_CFscore)
cor.test(~ testing$pred_CFscore[testing$party_id == "Democrat"] + testing$true_CFscore[testing$party_id == "Democrat"])
cor.test(~ testing$pred_CFscore[testing$party_id == "Independent"] + testing$true_CFscore[testing$party_id == "Independent"])
cor.test(~ testing$pred_CFscore[testing$party_id == "Republican"] + testing$true_CFscore[testing$party_id == "Republican"])
cor.test(~ testing$pred_CFscore[testing$true_CFscore < 1 & testing$true_CFscore > -1] + 
           testing$true_CFscore[testing$true_CFscore < 1 & testing$true_CFscore > -1]) # 0.82 among moderates

###

# 4. Clean up
rm(list=setdiff(ls(),''))

